/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.wikipedia;

import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.BOLD;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.BOLD_ITALICS;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.CATEGORY;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.CITATION;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.EXTERNAL_LINK;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.EXTERNAL_LINK_URL;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.HEADING;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.INTERNAL_LINK;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.ITALICS;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.SUB_HEADING;
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;

/** Basic Tests for {@link WikipediaTokenizer} */
public class TestWikipediaTokenizer extends BaseTokenStreamTestCase {
  protected static final String LINK_PHRASES =
      "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";

  public void testSimple() throws Exception {
    String text = "This is a [[Category:foo]]";
    WikipediaTokenizer tf =
        new WikipediaTokenizer(
            newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
    tf.setReader(new StringReader(text));
    assertTokenStreamContents(
        tf,
        new String[] {"This", "is", "a", "foo"},
        new int[] {0, 5, 8, 21},
        new int[] {4, 7, 9, 24},
        new String[] {"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", CATEGORY},
        new int[] {
          1, 1, 1, 1,
        },
        text.length());
  }

  public void testHandwritten() throws Exception {
    // make sure all tokens are in only one type
    String test =
        "[[link]] This is a [[Category:foo]] Category  This is a linked [[:Category:bar none withstanding]] "
            + "Category This is (parens) This is a [[link]]  This is an external URL [http://lucene.apache.org] "
            + "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' "
            + " This is a [[link|display info]]  This is a period.  Here is $3.25 and here is 3.50.  Here's Johnny.  "
            + "==heading== ===sub head=== followed by some text  [[Category:blah| ]] "
            + "''[[Category:ital_cat]]''  here is some that is ''italics [[Category:foo]] but is never closed."
            + "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this"
            + " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
            + " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";

    WikipediaTokenizer tf =
        new WikipediaTokenizer(
            newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
    tf.setReader(new StringReader(test));
    assertTokenStreamContents(
        tf,
        new String[] {
          "link",
          "This",
          "is",
          "a",
          "foo",
          "Category",
          "This",
          "is",
          "a",
          "linked",
          "bar",
          "none",
          "withstanding",
          "Category",
          "This",
          "is",
          "parens",
          "This",
          "is",
          "a",
          "link",
          "This",
          "is",
          "an",
          "external",
          "URL",
          "http://lucene.apache.org",
          "Here",
          "is",
          "italics",
          "and",
          "more",
          "italics",
          "bold",
          "and",
          "five",
          "quotes",
          "This",
          "is",
          "a",
          "link",
          "display",
          "info",
          "This",
          "is",
          "a",
          "period",
          "Here",
          "is",
          "3.25",
          "and",
          "here",
          "is",
          "3.50",
          "Here's",
          "Johnny",
          "heading",
          "sub",
          "head",
          "followed",
          "by",
          "some",
          "text",
          "blah",
          "ital",
          "cat",
          "here",
          "is",
          "some",
          "that",
          "is",
          "italics",
          "foo",
          "but",
          "is",
          "never",
          "closed",
          "same",
          "foo",
          "goes",
          "for",
          "this",
          "and2",
          "foo",
          "and",
          "this",
          "http://foo.boo.com/test/test/",
          "Test",
          "Test",
          "http://foo.boo.com/test/test/test.html",
          "Test",
          "Test",
          "http://foo.boo.com/test/test/test.html?g=b&c=d",
          "Test",
          "Test",
          "Citation",
          "martian",
          "code"
        },
        new String[] {
          INTERNAL_LINK,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          CATEGORY,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          CATEGORY,
          CATEGORY,
          CATEGORY,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          INTERNAL_LINK,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          EXTERNAL_LINK_URL,
          "<ALPHANUM>",
          "<ALPHANUM>",
          ITALICS,
          "<ALPHANUM>",
          ITALICS,
          ITALICS,
          BOLD,
          "<ALPHANUM>",
          BOLD_ITALICS,
          BOLD_ITALICS,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          INTERNAL_LINK,
          INTERNAL_LINK,
          INTERNAL_LINK,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<NUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<NUM>",
          "<APOSTROPHE>",
          "<ALPHANUM>",
          HEADING,
          SUB_HEADING,
          SUB_HEADING,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          CATEGORY,
          CATEGORY,
          CATEGORY,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          ITALICS,
          CATEGORY,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          BOLD,
          CATEGORY,
          "<ALPHANUM>",
          "<ALPHANUM>",
          "<ALPHANUM>",
          BOLD_ITALICS,
          CATEGORY,
          "<ALPHANUM>",
          "<ALPHANUM>",
          EXTERNAL_LINK_URL,
          EXTERNAL_LINK,
          EXTERNAL_LINK,
          EXTERNAL_LINK_URL,
          EXTERNAL_LINK,
          EXTERNAL_LINK,
          EXTERNAL_LINK_URL,
          EXTERNAL_LINK,
          EXTERNAL_LINK,
          CITATION,
          "<ALPHANUM>",
          "<ALPHANUM>"
        });
  }

  public void testLinkPhrases() throws Exception {
    WikipediaTokenizer tf =
        new WikipediaTokenizer(
            newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
    tf.setReader(new StringReader(LINK_PHRASES));
    checkLinkPhrases(tf);
  }

  private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
    assertTokenStreamContents(
        tf,
        new String[] {
          "click",
          "link",
          "here",
          "again",
          "click",
          "http://lucene.apache.org",
          "here",
          "again",
          "a",
          "b",
          "c",
          "d"
        },
        new int[] {1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
  }

  public void testLinks() throws Exception {
    String test =
        "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
    WikipediaTokenizer tf =
        new WikipediaTokenizer(
            newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, Collections.<String>emptySet());
    tf.setReader(new StringReader(test));
    assertTokenStreamContents(
        tf,
        new String[] {
          "http://lucene.apache.org/java/docs/index.html#news",
          "here",
          "http://lucene.apache.org/java/docs/index.html?b=c",
          "here",
          "https://lucene.apache.org/java/docs/index.html?b=c",
          "here"
        },
        new String[] {
          EXTERNAL_LINK_URL,
          EXTERNAL_LINK,
          EXTERNAL_LINK_URL,
          EXTERNAL_LINK,
          EXTERNAL_LINK_URL,
          EXTERNAL_LINK,
        });
  }

  public void testLucene1133() throws Exception {
    Set<String> untoks = new HashSet<>();
    untoks.add(WikipediaTokenizer.CATEGORY);
    untoks.add(WikipediaTokenizer.ITALICS);
    // should be exactly the same, regardless of untoks
    WikipediaTokenizer tf =
        new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.TOKENS_ONLY, untoks);
    tf.setReader(new StringReader(LINK_PHRASES));
    checkLinkPhrases(tf);
    String test =
        "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
    tf = new WikipediaTokenizer(WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
    tf.setReader(new StringReader(test));
    assertTokenStreamContents(
        tf,
        new String[] {
          "a b c d",
          "e f g",
          "link",
          "here",
          "link",
          "there",
          "italics here",
          "something",
          "more italics",
          "h   i   j"
        },
        new int[] {11, 32, 42, 47, 56, 61, 71, 86, 98, 124},
        new int[] {18, 37, 46, 51, 60, 66, 83, 95, 110, 133},
        new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
  }

  public void testBoth() throws Exception {
    Set<String> untoks = new HashSet<>();
    untoks.add(WikipediaTokenizer.CATEGORY);
    untoks.add(WikipediaTokenizer.ITALICS);
    String test =
        "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
    // should output all the indivual tokens plus the untokenized tokens as well.  Untokenized
    // tokens
    WikipediaTokenizer tf =
        new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
    tf.setReader(new StringReader(test));
    assertTokenStreamContents(
        tf,
        new String[] {
          "a b c d",
          "a",
          "b",
          "c",
          "d",
          "e f g",
          "e",
          "f",
          "g",
          "link",
          "here",
          "link",
          "there",
          "italics here",
          "italics",
          "here",
          "something",
          "more italics",
          "more",
          "italics",
          "h   i   j",
          "h",
          "i",
          "j"
        },
        new int[] {
          11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124,
          128, 132
        },
        new int[] {
          18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133,
          125, 129, 133
        },
        new int[] {1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1});

    // now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
    tf = new WikipediaTokenizer(newAttributeFactory(), WikipediaTokenizer.BOTH, untoks);
    tf.setReader(new StringReader(test));
    int[] expectedFlags =
        new int[] {
          UNTOKENIZED_TOKEN_FLAG,
          0,
          0,
          0,
          0,
          UNTOKENIZED_TOKEN_FLAG,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          UNTOKENIZED_TOKEN_FLAG,
          0,
          0,
          0,
          UNTOKENIZED_TOKEN_FLAG,
          0,
          0,
          UNTOKENIZED_TOKEN_FLAG,
          0,
          0,
          0
        };
    FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
    tf.reset();
    for (int i = 0; i < expectedFlags.length; i++) {
      assertTrue(tf.incrementToken());
      assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
    }
    assertFalse(tf.incrementToken());
    tf.close();
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    Analyzer a =
        new Analyzer() {

          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer =
                new WikipediaTokenizer(
                    newAttributeFactory(),
                    WikipediaTokenizer.TOKENS_ONLY,
                    Collections.<String>emptySet());
            return new TokenStreamComponents(tokenizer, tokenizer);
          }
        };
    // TODO: properly support positionLengthAttribute
    checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER, 20, false, false);
    a.close();
  }

  /** blast some random large strings through the analyzer */
  public void testRandomHugeStrings() throws Exception {
    Random random = random();
    Analyzer a =
        new Analyzer() {

          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer =
                new WikipediaTokenizer(
                    newAttributeFactory(),
                    WikipediaTokenizer.TOKENS_ONLY,
                    Collections.<String>emptySet());
            return new TokenStreamComponents(tokenizer, tokenizer);
          }
        };
    // TODO: properly support positionLengthAttribute
    checkRandomData(random, a, 10 * RANDOM_MULTIPLIER, 8192, false, false);
    a.close();
  }
}
